BED file

library(ggplot2)
library(rafalib)
df <- read.table("~/data/molly/modified_bases.6mA.bed")
colnames(df) <- c("chrom", "start", "end", "name", "score", "strand", "startCodon", "stopCodon", "color", "coverage", "percentage")

hist(df$coverage, breaks=400, xlim=c(0,1000))

df <- df[df$coverage > 10,]

plot(df$percentage[1:1e4], pch=16, cex=1/2)

dfHML <- df[df$chrom == "III" & df$start > 1 & df$end < 2.2e4,]
plot(x=dfHML$start, y=dfHML$percentage, pch=16,main="HML region", cex=dfHML$coverage/100,
     xlab="Position on chr3", ylab="Percentage of methylated reads")

dfHMR <- df[df$chrom == "III" & df$start > 285e3 & df$end < 300e3,]
plot(x=dfHMR$start, y=dfHMR$percentage, pch=16, main="HMR region", cex=dfHMR$coverage/100,
     xlab="Position on chr3", ylab="Percentage of methylated reads")

dfHMR2 <- df[df$chrom == "III" & df$start > 293000 & df$end < 294000,]
plot(x=dfHMR2$start, y=dfHMR2$percentage, pch=16, main="HMR region", cex=dfHMR2$coverage/100,
     xlab="Position on chr3", ylab="Percentage of methylated reads")

Periodicity visualization: Doudna run

nucDat <- openxlsx::read.xlsx("../../data/HMR_HML_nucleosome positions.xlsx")

HMR

nucDatHMR <- nucDat[nucDat$region == "HMR",]
dfHMRNuc <- dfHMR[dfHMR$start > min(nucDatHMR$Start) & dfHMR$stop < max(nucDatHMR$Stop),]
plot(x=dfHMRNuc$start, y=dfHMRNuc$percentage, cex=log(dfHMRNuc$coverage)/10, pch=16)
abline(v=nucDatHMR$dyad, col="red")

pListHMR <- list()
for(nn in 1:(nrow(nucDatHMR)-1)){
  curStart <- nucDatHMR$dyad[nn]
  curStop <- nucDatHMR$dyad[nn+1]
  curDf <- dfHMRNuc[dfHMRNuc$start>=curStart & dfHMRNuc$end<=curStop,]
  curDf$normPos <- (curDf$start - min(curDf$start))
  curDf$normPos <- curDf$normPos / max(curDf$normPos)
  
  pListHMR[[nn]] <- ggplot(curDf, aes(x=normPos, y=percentage)) +
    geom_point(col="grey", alpha=.4) +
    geom_smooth(aes(weight=coverage)) +
    theme_bw() +
    ggtitle(paste0("chrIII: ",round(min(curDf$start)/1e3,2),"k-",round(max(curDf$stop)/1e3,2),"k")) +
    xlab("Dyad distance") +
    ylab("Percentage methylated")

}
cowplot::plot_grid(plotlist=pListHMR)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggsave("../../plots/dyadMethylationHMR_Doudna.pdf",
       width=16, height=10)

##HML

nucDatHML <- nucDat[nucDat$region == "HML",]
dfHMLNuc <- dfHML[dfHML$start > min(nucDatHML$Start) & dfHML$stop < max(nucDatHML$Stop),]
plot(x=dfHMLNuc$start, y=dfHMLNuc$percentage, cex=log(dfHMLNuc$coverage)/10, pch=16)
abline(v=nucDatHML$dyad, col="red")

pListHML <- list()
for(nn in 1:(nrow(nucDatHML)-1)){
  curStart <- nucDatHML$dyad[nn]
  curStop <- nucDatHML$dyad[nn+1]
  curDf <- dfHMLNuc[dfHMLNuc$start>=curStart & dfHMLNuc$end<=curStop,]
  curDf$normPos <- (curDf$start - min(curDf$start))
  curDf$normPos <- curDf$normPos / max(curDf$normPos)
  
  pListHML[[nn]] <- ggplot(curDf, aes(x=normPos, y=percentage)) +
    geom_point(col="grey", alpha=.4) +
    geom_smooth(aes(weight=coverage)) +
    theme_bw() +
    ggtitle(paste0("chrIII: ",round(min(curDf$start)/1e3,2),"k-",round(max(curDf$stop)/1e3,2),"k")) +
    xlab("Dyad distance") +
    ylab("Percentage methylated")

}
## Warning in min(curDf$start): no non-missing arguments to min; returning Inf
## Warning in max(curDf$normPos): no non-missing arguments to max; returning -Inf
## Warning in min(curDf$start): no non-missing arguments to min; returning Inf
## Warning in max(curDf$stop): no non-missing arguments to max; returning -Inf
cowplot::plot_grid(plotlist=pListHML)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggsave("../../plots/dyadMethylationHML_Doudna.pdf",
       width=16, height=10)

Periodicity visualization: Turkey run sample 06

dfTurkey <- read.table("~/data/molly/megalodon_output_06/modified_bases.aggregate06.6mA.bed")
colnames(dfTurkey) <- c("chrom", "start", "end", "name", "score", "strand", "startCodon", "stopCodon", "color", "coverage", "percentage")

HMR region

dfHMRNucTurkey <- dfTurkey[dfTurkey$chrom == "III" & dfTurkey$start > min(nucDatHMR$Start) & dfTurkey$stop < max(nucDatHMR$Stop),]
plot(x=dfHMRNucTurkey$start, y=dfHMRNucTurkey$percentage, cex=log(dfHMRNucTurkey$coverage)/10, pch=16)
abline(v=nucDatHMR$dyad, col="red")

pListHMRTurkey <- list()
for(nn in 1:(nrow(nucDatHMR)-1)){
  curStart <- nucDatHMR$dyad[nn]
  curStop <- nucDatHMR$dyad[nn+1]
  curDf <- dfHMRNucTurkey[dfHMRNucTurkey$start>=curStart & dfHMRNucTurkey$end<=curStop,]
  curDf$normPos <- (curDf$start - min(curDf$start))
  curDf$normPos <- curDf$normPos / max(curDf$normPos)
  
  pListHMRTurkey[[nn]] <- ggplot(curDf, aes(x=normPos, y=percentage)) +
    geom_point(col="grey", alpha=.4) +
    geom_smooth(aes(weight=coverage)) +
    theme_bw() +
    ggtitle(paste0("chrIII: ",round(min(curDf$start)/1e3,2),"k-",round(max(curDf$stop)/1e3,2),"k")) +
    xlab("Dyad distance") +
    ylab("Percentage methylated")

}
cowplot::plot_grid(plotlist=pListHMRTurkey)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf

ggsave("../../plots/dyadMethylationHMR_turkey06.pdf",
       width=16, height=10)

HML region

dfHMLNucTurkey <- dfTurkey[dfTurkey$chrom == "III" & dfTurkey$start > min(nucDatHML$Start) & dfTurkey$stop < max(nucDatHML$Stop),]
plot(x=dfHMLNucTurkey$start, y=dfHMLNucTurkey$percentage, cex=log(dfHMLNucTurkey$coverage)/10, pch=16)
abline(v=nucDatHMR$dyad, col="red")

pListHMLTurkey <- list()
for(nn in 1:(nrow(nucDatHML)-1)){
  curStart <- nucDatHML$dyad[nn]
  curStop <- nucDatHML$dyad[nn+1]
  curDf <- dfHMLNucTurkey[dfHMLNucTurkey$start>=curStart & dfHMLNucTurkey$end<=curStop,]
  curDf$normPos <- (curDf$start - min(curDf$start))
  curDf$normPos <- curDf$normPos / max(curDf$normPos)
  
  pListHMLTurkey[[nn]] <- ggplot(curDf, aes(x=normPos, y=percentage)) +
    geom_point(col="grey", alpha=.4) +
    geom_smooth(aes(weight=coverage)) +
    theme_bw() +
    ggtitle(paste0("chrIII: ",round(min(curDf$start)/1e3,2),"k-",round(max(curDf$stop)/1e3,2),"k")) +
    xlab("Dyad distance") +
    ylab("Percentage methylated")

}
## Warning in min(curDf$start): no non-missing arguments to min; returning Inf
## Warning in max(curDf$normPos): no non-missing arguments to max; returning -Inf
## Warning in min(curDf$start): no non-missing arguments to min; returning Inf
## Warning in max(curDf$stop): no non-missing arguments to max; returning -Inf
cowplot::plot_grid(plotlist=pListHMLTurkey)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggsave("../../plots/dyadMethylationHML_turkey06.pdf",
       width=16, height=10)

Periodicity visualization: Turkey run sample 07

dfTurkey <- read.table("~/data/molly/megalodon_output_07/modified_bases.aggregate07.6mA.bed")
colnames(dfTurkey) <- c("chrom", "start", "end", "name", "score", "strand", "startCodon", "stopCodon", "color", "coverage", "percentage")

HMR region

dfHMRNucTurkey <- dfTurkey[dfTurkey$chrom == "III" & dfTurkey$start > min(nucDatHMR$Start) & dfTurkey$stop < max(nucDatHMR$Stop),]
plot(x=dfHMRNucTurkey$start, y=dfHMRNucTurkey$percentage, cex=log(dfHMRNucTurkey$coverage)/10, pch=16)
abline(v=nucDatHMR$dyad, col="red")

pListHMRTurkey <- list()
for(nn in 1:(nrow(nucDatHMR)-1)){
  curStart <- nucDatHMR$dyad[nn]
  curStop <- nucDatHMR$dyad[nn+1]
  curDf <- dfHMRNucTurkey[dfHMRNucTurkey$start>=curStart & dfHMRNucTurkey$end<=curStop,]
  curDf$normPos <- (curDf$start - min(curDf$start))
  curDf$normPos <- curDf$normPos / max(curDf$normPos)
  
  pListHMRTurkey[[nn]] <- ggplot(curDf, aes(x=normPos, y=percentage)) +
    geom_point(col="grey", alpha=.4) +
    geom_smooth(aes(weight=coverage)) +
    theme_bw() +
    ggtitle(paste0("chrIII: ",round(min(curDf$start)/1e3,2),"k-",round(max(curDf$stop)/1e3,2),"k")) +
    xlab("Dyad distance") +
    ylab("Percentage methylated")

}
cowplot::plot_grid(plotlist=pListHMRTurkey)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf

ggsave("../../plots/dyadMethylationHMR_turkey07.pdf",
       width=16, height=10)

HML region

dfHMLNucTurkey <- dfTurkey[dfTurkey$chrom == "III" & dfTurkey$start > min(nucDatHML$Start) & dfTurkey$stop < max(nucDatHML$Stop),]
plot(x=dfHMLNucTurkey$start, y=dfHMLNucTurkey$percentage, cex=log(dfHMLNucTurkey$coverage)/10, pch=16)
abline(v=nucDatHMR$dyad, col="red")

pListHMLTurkey <- list()
for(nn in 1:(nrow(nucDatHML)-1)){
  curStart <- nucDatHML$dyad[nn]
  curStop <- nucDatHML$dyad[nn+1]
  curDf <- dfHMLNucTurkey[dfHMLNucTurkey$start>=curStart & dfHMLNucTurkey$end<=curStop,]
  curDf$normPos <- (curDf$start - min(curDf$start))
  curDf$normPos <- curDf$normPos / max(curDf$normPos)
  
  pListHMLTurkey[[nn]] <- ggplot(curDf, aes(x=normPos, y=percentage)) +
    geom_point(col="grey", alpha=.4) +
    geom_smooth(aes(weight=coverage)) +
    theme_bw() +
    ggtitle(paste0("chrIII: ",round(min(curDf$start)/1e3,2),"k-",round(max(curDf$stop)/1e3,2),"k")) +
    xlab("Dyad distance") +
    ylab("Percentage methylated")

}
## Warning in min(curDf$start): no non-missing arguments to min; returning Inf
## Warning in max(curDf$normPos): no non-missing arguments to max; returning -Inf
## Warning in min(curDf$start): no non-missing arguments to min; returning Inf
## Warning in max(curDf$stop): no non-missing arguments to max; returning -Inf
cowplot::plot_grid(plotlist=pListHMLTurkey)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf

ggsave("../../plots/dyadMethylationHML_turkey07.pdf",
       width=16, height=10)

Extract single reads and methylation probabilities

HMR region

library(Rsamtools)
## Loading required package: GenomeInfoDb
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
# file below is the mod_mappings.bam file on Savio, sorted using 'samtools sort'
bamPath <- "~/data/molly/mod_mappings.sorted.bam" 
bamFile <- BamFile(bamPath)
bamFile
## class: BamFile 
## path: /Users/koenvandenberge/data/molly/mod_mappings.sorted.bam
## index: /Users/koenvandenberge/data/molly/mod_mappings.sorted.bam.bai
## isOpen: FALSE 
## yieldSize: NA 
## obeyQname: FALSE 
## asMates: FALSE 
## qnamePrefixEnd: NA 
## qnameSuffixStart: NA
scanBamHeader(bamFile)
## $targets
##       I      II     III      IV      IX      MT       V      VI     VII    VIII 
##  230218  813184  316620 1531933  439888   85779  576874  270161 1090940  562643 
##       X      XI     XII    XIII     XIV      XV     XVI 
##  745751  666816 1078177  924431  784333 1091291  948066 
## 
## $text
## $text$`@HD`
## [1] "VN:1.4"        "SO:coordinate"
## 
## $text$`@SQ`
## [1] "SN:I"      "LN:230218"
## 
## $text$`@SQ`
## [1] "SN:II"     "LN:813184"
## 
## $text$`@SQ`
## [1] "SN:III"    "LN:316620"
## 
## $text$`@SQ`
## [1] "SN:IV"      "LN:1531933"
## 
## $text$`@SQ`
## [1] "SN:IX"     "LN:439888"
## 
## $text$`@SQ`
## [1] "SN:MT"    "LN:85779"
## 
## $text$`@SQ`
## [1] "SN:V"      "LN:576874"
## 
## $text$`@SQ`
## [1] "SN:VI"     "LN:270161"
## 
## $text$`@SQ`
## [1] "SN:VII"     "LN:1090940"
## 
## $text$`@SQ`
## [1] "SN:VIII"   "LN:562643"
## 
## $text$`@SQ`
## [1] "SN:X"      "LN:745751"
## 
## $text$`@SQ`
## [1] "SN:XI"     "LN:666816"
## 
## $text$`@SQ`
## [1] "SN:XII"     "LN:1078177"
## 
## $text$`@SQ`
## [1] "SN:XIII"   "LN:924431"
## 
## $text$`@SQ`
## [1] "SN:XIV"    "LN:784333"
## 
## $text$`@SQ`
## [1] "SN:XV"      "LN:1091291"
## 
## $text$`@SQ`
## [1] "SN:XVI"    "LN:948066"
## 
## $text$`@RG`
## [1] "ID:1"      "SM:SAMPLE"
seqinfo(bamFile)
## Seqinfo object with 17 sequences from an unspecified genome:
##   seqnames seqlengths isCircular genome
##   I            230218       <NA>   <NA>
##   II           813184       <NA>   <NA>
##   III          316620       <NA>   <NA>
##   IV          1531933       <NA>   <NA>
##   IX           439888       <NA>   <NA>
##   ...             ...        ...    ...
##   XII         1078177       <NA>   <NA>
##   XIII         924431       <NA>   <NA>
##   XIV          784333       <NA>   <NA>
##   XV          1091291       <NA>   <NA>
##   XVI          948066       <NA>   <NA>
grHMR <- GRanges(seqnames = "III", 
              ranges = IRanges(start = 285e3, end = 300e3))
sbp <- ScanBamParam(which = grHMR, what = c("qname", # read ID
                                            "seq", # sequence
                                            "pos")) # read position (start)
aln <- scanBam(bamFile, param = sbp)

customParam <- ScanBamParam(which = grHMR,
                            tag=c("Mm", "Ml"), 
                            what=c("flag", #custom tag (methylation)
                                   "qname", # read ID
                                   "seq", # sequence
                                   "pos")) #start position
aln <- scanBam(bamFile, which=grHMR, param = customParam)


bases <- aln$`III:285000-300000`$seq
methylTag <- aln$`III:285000-300000`$tag$Mm
methylProb <- aln$`III:285000-300000`$tag$Ml

# methylation probabilities do seem to be very much read dependent
rafalib::mypar(mfrow=c(4,4))
hlp <- lapply(methylProb[1:112], hist, breaks=40, xlim=c(0,255))

# See https://github.com/jkbonfield/hts-specs/blob/methylation/SAMtags.tex 
# for explanation on how the Mm and Ml tags work.


posProbList <- list()
for(rr in 1:length(aln$`III:285000-300000`$seq)){
  # loop over each read
  curSeq <- aln$`III:285000-300000`$seq[rr][[1]]
  curStart <- aln$`III:285000-300000`$pos[rr]
  curTag <- aln$`III:285000-300000`$tag$Mm[rr]
  curProb <- aln$`III:285000-300000`$tag$Ml[[rr]]
  
  # extract A bases and corresponding positions
  # start at starting position, and total number of positions equal total number of As in sequence.
  posA <- curStart + which(isMatchingAt("A", curSeq, at = 1:length(curSeq))) - 1
  
  # get at which As are considered possibly methylated
  tagSplit <- gsub(pattern=";", x=strsplit(curTag, split=",")[[1]], replacement="")
  stopifnot(tagSplit[1] == "A+a") # check if we're looking at m6A
  tagSplit <- as.numeric(tagSplit[-1])
  
  # number of As considered for methylation should be equal to length of probability vector
  stopifnot((sum(tagSplit == 0) + sum(tagSplit > 0)) == length(curProb))
  
  #get pos and prob of As considered potentially methylated
  posMet <- posA[cumsum(tagSplit+1)] 
  probMet <- curProb / 255
  df <- data.frame(pos = posMet,
                   prob = probMet)
  # #not considered methylated: doesn't work yet
  # posNull <- posA[-cumsum(tagSplit+1)]
  # probNull <- rep(0, length(posNull))
  # df <- data.frame(pos = c(posMet, posNull), 
  #                  prob = c(probMet, probNull)) # positions and methyl probs
  posProbList[[rr]] <- df
}

rafalib::mypar(mfrow=c(1,1))
pal <- wesanderson::wes_palette("Zissou1", n=256, type="continuous")
par(bty="l")
plot(x=seq(285e3, 300e3, length=length(posProbList)),
     y=1:length(posProbList),
     type = "n",
     xlab = "Position on ChrIII",
     ylab= "Read colored by methylation probability")
for(ii in 1:length(posProbList)){
  curPos <- posProbList[[ii]]$pos
  points(x=curPos, y=rep(ii, length(curPos)), col=pal[posProbList[[ii]]$prob*255],
         pch=16, cex=1/20)
}
abline(v=292e3, lty=2)
abline(v=295e3, lty=2)

Lower and upper thresholds

for(tt in 0:7){
  binaryPal <- wesanderson::wes_palette("Zissou1", n=2, type="continuous")
  rafalib::mypar(mfrow=c(1,1))
  #pal <- wesanderson::wes_palette("Zissou1", n=256, type="continuous")
  par(bty="l")
  plot(x=seq(285e3, 300e3, length=length(posProbList)),
       y=1:length(posProbList),
       type = "n",
       xlab = "Position on ChrIII",
       ylab= "Read colored by methylation probability",
       main = paste0("Lower: ",(0.4-(tt*0.05))*100,"%, upper: ", (0.6+(tt*0.05))*100,"%"))
  for(ii in 1:length(posProbList)){
    curDf <- posProbList[[ii]]
    curDf <- curDf[curDf$prob >= 0.6+(tt*0.05) | curDf$prob <= 0.4-(tt*0.05),]
    curPos <- curDf$pos
    curMet <- as.numeric(curDf$prob >= .8)
    points(x=curPos, y=rep(ii, length(curPos)), col=binaryPal[curMet+1],
           pch=16, cex=1/20)
  }
  abline(v=292e3, lty=2)
  abline(v=295e3, lty=2)
}